library(tidyverse)
## ── Attaching packages ──────────────── tidyverse 1.3.0 ──
## ✓ ggplot2 3.2.1     ✓ purrr   0.3.3
## ✓ tibble  2.1.3     ✓ dplyr   0.8.4
## ✓ tidyr   1.0.2     ✓ stringr 1.4.0
## ✓ readr   1.3.1     ✓ forcats 0.4.0
## ── Conflicts ─────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
library(tidytext)
library(janeaustenr)
# Tokenize the Austen novels into single words, strip surrounding
# non-letter characters, and count occurrences of each word per book.
book_words <- austen_books() %>%
  unnest_tokens(word, text) %>%
  mutate(word = str_extract(word, "[a-z']+")) %>%
  count(book, word, sort = TRUE)


# Total word count of each book (denominator for term frequency).
total_words <- book_words %>%
  group_by(book) %>%
  summarize(total = sum(n))

# Attach each book's total word count. The join key is made explicit so
# the code is self-documenting and the "Joining, by = ..." message from
# an implicit natural join is avoided.
book_words <- book_words %>%
  left_join(total_words, by = "book")

book_words
## # A tibble: 39,708 x 4
##    book              word      n  total
##    <fct>             <chr> <int>  <int>
##  1 Mansfield Park    the    6209 160460
##  2 Mansfield Park    to     5477 160460
##  3 Mansfield Park    and    5439 160460
##  4 Emma              to     5242 160996
##  5 Emma              the    5204 160996
##  6 Emma              and    4897 160996
##  7 Mansfield Park    of     4778 160460
##  8 Pride & Prejudice the    4331 122204
##  9 Emma              of     4293 160996
## 10 Pride & Prejudice to     4163 122204
## # … with 39,698 more rows
# Term-frequency distributions: every novel has a long right tail
# (many rare words, few very common ones).
book_words %>%
  ggplot(aes(x = n / total, fill = book)) +
  geom_histogram(show.legend = FALSE) +
  xlim(NA, 0.0009) +
  facet_wrap(~book, ncol = 2, scales = "free")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 896 rows containing non-finite values (stat_bin).
## Warning: Removed 6 rows containing missing values (geom_bar).

# How many (book, word) pairs occur exactly once (hapax legomena)?
sum(book_words$n == 1)
## [1] 15929
# Rank words within each book (1 = most frequent) and compute each
# word's share of that book's total words. The result stays grouped
# by book, matching the printed "Groups: book [6]" below.
freq_by_rank <- book_words %>%
  group_by(book) %>%
  mutate(
    rank = row_number(),
    term_frequency = n / total
  )

freq_by_rank
## # A tibble: 39,708 x 6
## # Groups:   book [6]
##    book              word      n  total  rank term_frequency
##    <fct>             <chr> <int>  <int> <int>          <dbl>
##  1 Mansfield Park    the    6209 160460     1         0.0387
##  2 Mansfield Park    to     5477 160460     2         0.0341
##  3 Mansfield Park    and    5439 160460     3         0.0339
##  4 Emma              to     5242 160996     1         0.0326
##  5 Emma              the    5204 160996     2         0.0323
##  6 Emma              and    4897 160996     3         0.0304
##  7 Mansfield Park    of     4778 160460     4         0.0298
##  8 Pride & Prejudice the    4331 122204     1         0.0354
##  9 Emma              of     4293 160996     4         0.0267
## 10 Pride & Prejudice to     4163 122204     2         0.0341
## # … with 39,698 more rows
# Zipf's law: term frequency against rank on log-log axes.
freq_by_rank %>%
  ggplot(aes(rank, term_frequency, color = book)) +
  geom_line(size = 1.1, alpha = 0.8, show.legend = FALSE) +
  scale_y_log10() +
  scale_x_log10()

# Fit a power law over the middle ranks, where the log-log relation
# is close to linear.
rank_subset <- freq_by_rank %>%
  filter(rank > 10, rank < 500)

lmout <- lm(log10(term_frequency) ~ log10(rank), data = rank_subset)
# Columns 1, 2, 5 of broom::tidy() are term, estimate, p.value.
broom::tidy(lmout)[c("term", "estimate", "p.value")]
## # A tibble: 2 x 3
##   term        estimate p.value
##   <chr>          <dbl>   <dbl>
## 1 (Intercept)   -0.621       0
## 2 log10(rank)   -1.11        0
# Overlay the fitted power law (slope near -1, the classic Zipf exponent).
freq_by_rank %>%
  ggplot(aes(rank, term_frequency, color = book)) +
  geom_abline(intercept = -0.62, slope = -1.1, color = "red", linetype = 2) +
  geom_line(size = 1.1, alpha = 0.8, show.legend = FALSE) +
  scale_y_log10() +
  scale_x_log10()

# Add tf, idf, and tf-idf columns; idf (hence tf-idf) is 0 for words
# that appear in all six books, as the printed rows below show.
book_words <- book_words %>%
  bind_tf_idf(word, book, n)

book_words
## # A tibble: 39,708 x 7
##    book              word      n  total     tf   idf tf_idf
##    <fct>             <chr> <int>  <int>  <dbl> <dbl>  <dbl>
##  1 Mansfield Park    the    6209 160460 0.0387     0      0
##  2 Mansfield Park    to     5477 160460 0.0341     0      0
##  3 Mansfield Park    and    5439 160460 0.0339     0      0
##  4 Emma              to     5242 160996 0.0326     0      0
##  5 Emma              the    5204 160996 0.0323     0      0
##  6 Emma              and    4897 160996 0.0304     0      0
##  7 Mansfield Park    of     4778 160460 0.0298     0      0
##  8 Pride & Prejudice the    4331 122204 0.0354     0      0
##  9 Emma              of     4293 160996 0.0267     0      0
## 10 Pride & Prejudice to     4163 122204 0.0341     0      0
## # … with 39,698 more rows
# Highest tf-idf terms: character names unique to each novel.
book_words %>%
  arrange(desc(tf_idf)) %>%
  select(-total)
## # A tibble: 39,708 x 6
##    book                word          n      tf   idf  tf_idf
##    <fct>               <chr>     <int>   <dbl> <dbl>   <dbl>
##  1 Sense & Sensibility elinor      623 0.00519  1.79 0.00931
##  2 Sense & Sensibility marianne    492 0.00410  1.79 0.00735
##  3 Mansfield Park      crawford    493 0.00307  1.79 0.00551
##  4 Pride & Prejudice   darcy       374 0.00306  1.79 0.00548
##  5 Persuasion          elliot      254 0.00304  1.79 0.00544
##  6 Emma                emma        786 0.00488  1.10 0.00536
##  7 Northanger Abbey    tilney      196 0.00252  1.79 0.00452
##  8 Emma                weston      389 0.00242  1.79 0.00433
##  9 Pride & Prejudice   bennet      294 0.00241  1.79 0.00431
## 10 Persuasion          wentworth   191 0.00228  1.79 0.00409
## # … with 39,698 more rows
# Top-10 tf-idf words per book. The weight column for top_n() is made
# explicit (consistent with the later top_n(15, tf_idf) calls), which
# also silences the "Selecting by tf_idf" message.
book_words %>%
  arrange(desc(tf_idf)) %>%
  # Freeze factor levels so bars appear in tf-idf order after coord_flip().
  mutate(word = factor(word, levels = rev(unique(word)))) %>%
  group_by(book) %>%
  top_n(10, tf_idf) %>%
  ungroup() %>%
  ggplot(aes(word, tf_idf, fill = book)) +
  geom_col(show.legend = FALSE) +
  labs(x = NULL, y = "tf-idf") +
  facet_wrap(~book, ncol = 2, scales = "free") +
  coord_flip()

# Physics Texts ----

library(gutenbergr)
# Download four physics texts from Project Gutenberg (one per author:
# Galileo, Huygens, Tesla, Einstein -- see the counts printed below),
# keeping the author metadata column. Requires network access.
physics <- gutenberg_download(c(37729, 14725, 13476, 30155), meta_fields = "author") 
## Determining mirror for Project Gutenberg from http://www.gutenberg.org/robot/harvest
## Using mirror http://aleph.gutenberg.org
# Tokenize the physics texts and count words per author.
physics_words <- physics %>%
  unnest_tokens(word, text) %>%
  mutate(word = str_extract(word, "[a-z']+")) %>%
  count(author, word, sort = TRUE)

physics_words
## # A tibble: 11,849 x 3
##    author              word      n
##    <chr>               <chr> <int>
##  1 Galilei, Galileo    the    3770
##  2 Tesla, Nikola       the    3606
##  3 Huygens, Christiaan the    3553
##  4 Einstein, Albert    the    2995
##  5 Galilei, Galileo    of     2051
##  6 Einstein, Albert    of     2028
##  7 Tesla, Nikola       of     1737
##  8 Huygens, Christiaan of     1708
##  9 Huygens, Christiaan to     1207
## 10 Tesla, Nikola       a      1176
## # … with 11,839 more rows
# tf-idf per author, with authors ordered chronologically for the facets.
physics_plot <- physics_words %>%
  bind_tf_idf(word, author, n) %>%
  mutate(word = fct_reorder(word, tf_idf)) %>%
  mutate(author = factor(author,
                         levels = c("Galilei, Galileo",
                                    "Huygens, Christiaan",
                                    "Tesla, Nikola",
                                    "Einstein, Albert")))

# Top 15 tf-idf words for each author.
physics_plot %>%
  group_by(author) %>%
  top_n(15, tf_idf) %>%
  ungroup() %>%
  mutate(word = reorder(word, tf_idf)) %>%
  ggplot(aes(word, tf_idf, fill = author)) +
  geom_col(show.legend = FALSE) +
  labs(x = NULL, y = "tf-idf") +
  facet_wrap(~author, ncol = 2, scales = "free") +
  coord_flip()

# Inspect lines containing "RC" -- the matches below show it is label
# notation from ray/wave diagrams, not a real word.
physics %>%
  filter(str_detect(text, "RC")) %>%
  select(text)
## # A tibble: 44 x 1
##    text                                                                  
##    <chr>                                                                 
##  1 line RC, parallel and equal to AB, to be a portion of a wave of light,
##  2 represents the partial wave coming from the point A, after the wave RC
##  3 be the propagation of the wave RC which fell on AB, and would be the  
##  4 transparent body; seeing that the wave RC, having come to the aperture
##  5 incident rays. Let there be such a ray RC falling upon the surface    
##  6 CK. Make CO perpendicular to RC, and across the angle KCO adjust OK,  
##  7 the required refraction of the ray RC. The demonstration of this is,  
##  8 explaining ordinary refraction. For the refraction of the ray RC is   
##  9 29. Now as we have found CI the refraction of the ray RC, similarly   
## 10 the ray _r_C is inclined equally with RC, the line C_d_ will          
## # … with 34 more rows
# Custom stop words: diagram labels and typesetting artifacts found in
# the physics texts.
mystopwords <- tibble(
  word = c("eq", "co", "rc", "ac", "ak", "bn", "fig", "file",
           "cg", "cb", "cm", "ab")
)

physics_words <- anti_join(physics_words, mystopwords, by = "word")

# Rebuild the tf-idf plot after removing the artifact tokens; strip
# underscores (leftover italic markup) before plotting, and use
# reorder_within() + scale_x_reordered() for per-facet ordering.
plot_physics <- physics_words %>%
  bind_tf_idf(word, author, n) %>%
  mutate(word = str_remove_all(word, "_")) %>%
  group_by(author) %>%
  top_n(15, tf_idf) %>%
  ungroup() %>%
  mutate(word = reorder_within(word, tf_idf, author)) %>%
  mutate(author = factor(author,
                         levels = c("Galilei, Galileo",
                                    "Huygens, Christiaan",
                                    "Tesla, Nikola",
                                    "Einstein, Albert")))

ggplot(plot_physics, aes(word, tf_idf, fill = author)) +
  geom_col(show.legend = FALSE) +
  labs(x = NULL, y = "tf-idf") +
  facet_wrap(~author, ncol = 2, scales = "free") +
  coord_flip() +
  scale_x_reordered()

# ngrams

# Tokenize the novels into bigrams (consecutive word pairs).
austen_bigrams <- austen_books() %>%
  unnest_tokens(bigram, text, token = "ngrams", n = 2)

austen_bigrams
## # A tibble: 725,049 x 2
##    book                bigram         
##    <fct>               <chr>          
##  1 Sense & Sensibility sense and      
##  2 Sense & Sensibility and sensibility
##  3 Sense & Sensibility sensibility by 
##  4 Sense & Sensibility by jane        
##  5 Sense & Sensibility jane austen    
##  6 Sense & Sensibility austen 1811    
##  7 Sense & Sensibility 1811 chapter   
##  8 Sense & Sensibility chapter 1      
##  9 Sense & Sensibility 1 the          
## 10 Sense & Sensibility the family     
## # … with 725,039 more rows
# Raw bigram frequencies are dominated by stop-word pairs ("of the", ...).
count(austen_bigrams, bigram, sort = TRUE)
## # A tibble: 211,236 x 2
##    bigram       n
##    <chr>    <int>
##  1 of the    3017
##  2 to be     2787
##  3 in the    2368
##  4 it was    1781
##  5 i am      1545
##  6 she had   1472
##  7 of her    1445
##  8 to the    1387
##  9 she was   1377
## 10 had been  1299
## # … with 211,226 more rows
# Split each bigram into its two component words.
bigrams_separated <- austen_bigrams %>%
  separate(bigram, c("word1", "word2"), sep = " ")

# Keep only bigrams in which neither word is a stop word.
bigrams_filtered <- bigrams_separated %>%
  filter(!word1 %in% stop_words$word,
         !word2 %in% stop_words$word)

# Bigram counts after stop-word filtering: mostly names and titles.
bigram_counts <- bigrams_filtered %>%
  count(word1, word2, sort = TRUE)

bigram_counts
## # A tibble: 33,421 x 3
##    word1   word2         n
##    <chr>   <chr>     <int>
##  1 sir     thomas      287
##  2 miss    crawford    215
##  3 captain wentworth   170
##  4 miss    woodhouse   162
##  5 frank   churchill   132
##  6 lady    russell     118
##  7 lady    bertram     114
##  8 sir     walter      113
##  9 miss    fairfax     109
## 10 colonel brandon     108
## # … with 33,411 more rows
# Recombine the filtered word pairs into single bigram strings.
bigrams_united <- bigrams_filtered %>%
  unite(bigram, word1, word2, sep = " ")

count(bigrams_united, bigram, sort = TRUE)
## # A tibble: 33,421 x 2
##    bigram                n
##    <chr>             <int>
##  1 sir thomas          287
##  2 miss crawford       215
##  3 captain wentworth   170
##  4 miss woodhouse      162
##  5 frank churchill     132
##  6 lady russell        118
##  7 lady bertram        114
##  8 sir walter          113
##  9 miss fairfax        109
## 10 colonel brandon     108
## # … with 33,411 more rows
# Words that precede "street" -- street names, tallied per book.
bigrams_filtered %>%
  filter(word2 == "street") %>%
  count(book, word1, sort = TRUE)
## # A tibble: 34 x 3
##    book                word1           n
##    <fct>               <chr>       <int>
##  1 Sense & Sensibility berkeley       16
##  2 Sense & Sensibility harley         16
##  3 Northanger Abbey    pulteney       14
##  4 Northanger Abbey    milsom         11
##  5 Mansfield Park      wimpole        10
##  6 Pride & Prejudice   gracechurch     9
##  7 Sense & Sensibility conduit         6
##  8 Sense & Sensibility bond            5
##  9 Persuasion          milsom          5
## 10 Persuasion          rivers          4
## # … with 24 more rows
# Equivalent query on the united bigrams via a substring match
# (matches more rows: "street" may appear as either word).
bigrams_united %>%
  filter(str_detect(bigram, "street")) %>%
  count(book, bigram, sort = TRUE)
## # A tibble: 57 x 3
##    book                bigram                 n
##    <fct>               <chr>              <int>
##  1 Sense & Sensibility berkeley street       16
##  2 Sense & Sensibility harley street         16
##  3 Northanger Abbey    pulteney street       14
##  4 Northanger Abbey    milsom street         11
##  5 Mansfield Park      wimpole street        10
##  6 Pride & Prejudice   gracechurch street     9
##  7 Sense & Sensibility conduit street         6
##  8 Sense & Sensibility bond street            5
##  9 Persuasion          milsom street          5
## 10 Persuasion          rivers street          4
## # … with 47 more rows
 bigrams_united %>%
  count(book, bigram) %>%
  bind_tf_idf(bigram, book, n) %>%
  arrange(desc(tf_idf)) ->
  bigram_tf_idf

bigram_tf_idf
## # A tibble: 36,217 x 6
##    book                bigram                n     tf   idf tf_idf
##    <fct>               <chr>             <int>  <dbl> <dbl>  <dbl>
##  1 Persuasion          captain wentworth   170 0.0299  1.79 0.0535
##  2 Mansfield Park      sir thomas          287 0.0287  1.79 0.0515
##  3 Mansfield Park      miss crawford       215 0.0215  1.79 0.0386
##  4 Persuasion          lady russell        118 0.0207  1.79 0.0371
##  5 Persuasion          sir walter          113 0.0198  1.79 0.0356
##  6 Emma                miss woodhouse      162 0.0170  1.79 0.0305
##  7 Northanger Abbey    miss tilney          82 0.0159  1.79 0.0286
##  8 Sense & Sensibility colonel brandon     108 0.0150  1.79 0.0269
##  9 Emma                frank churchill     132 0.0139  1.79 0.0248
## 10 Pride & Prejudice   lady catherine      100 0.0138  1.79 0.0247
## # … with 36,207 more rows
# Top-10 tf-idf bigrams per novel, ordered within each facet.
bigram_tf_idf %>%
  group_by(book) %>%
  top_n(10, tf_idf) %>%
  ungroup() %>%
  mutate(bigram = reorder_within(bigram, tf_idf, book)) %>%
  ggplot(aes(bigram, tf_idf, fill = book)) +
  geom_col(show.legend = FALSE) +
  labs(x = NULL, y = "tf-idf") +
  facet_wrap(~book, ncol = 2, scales = "free") +
  coord_flip() +
  scale_x_reordered()

# Which words most often follow "not"?
bigrams_separated %>%
  filter(word1 == "not") %>%
  count(word1, word2, sort = TRUE)
## # A tibble: 1,246 x 3
##    word1 word2     n
##    <chr> <chr> <int>
##  1 not   be      610
##  2 not   to      355
##  3 not   have    327
##  4 not   know    252
##  5 not   a       189
##  6 not   think   176
##  7 not   been    160
##  8 not   the     147
##  9 not   at      129
## 10 not   in      118
## # … with 1,236 more rows
# AFINN lexicon: words scored on an integer sentiment scale (the rows
# below show values like -2, -3). NOTE(review): get_sentiments() may
# prompt to download the lexicon on first use.
AFINN <- get_sentiments("afinn")
AFINN
## # A tibble: 2,477 x 2
##    word       value
##    <chr>      <dbl>
##  1 abandon       -2
##  2 abandoned     -2
##  3 abandons      -2
##  4 abducted      -2
##  5 abduction     -2
##  6 abductions    -2
##  7 abhor         -3
##  8 abhorred      -3
##  9 abhorrent     -3
## 10 abhors        -3
## # … with 2,467 more rows
# Sentiment-scored words that were preceded by "not".
not_words <- bigrams_separated %>%
  filter(word1 == "not") %>%
  inner_join(AFINN, by = c(word2 = "word")) %>%
  count(word2, value, sort = TRUE)

not_words
## # A tibble: 245 x 3
##    word2   value     n
##    <chr>   <dbl> <int>
##  1 like        2    99
##  2 help        2    82
##  3 want        1    45
##  4 wish        1    39
##  5 allow       1    36
##  6 care        2    23
##  7 sorry      -1    21
##  8 leave      -1    18
##  9 pretend    -1    18
## 10 worth       2    17
## # … with 235 more rows
# Words whose sentiment contribution is most distorted by a preceding
# "not" (contribution = score * occurrences; axis labels overridden).
not_words %>%
  mutate(contribution = n * value) %>%
  arrange(desc(abs(contribution))) %>%
  head(20) %>%
  mutate(word2 = reorder(word2, contribution)) %>%
  ggplot(aes(word2, contribution, fill = contribution > 0)) +
  geom_col(show.legend = FALSE) +
  xlab("Words preceded by \"not\"") +
  ylab("Sentiment value * number of occurrences") +
  coord_flip()

# Common negation terms and the sentiment-scored words that follow them.
negation_words <- c("not", "no", "never", "without")

negated_words <- bigrams_separated %>%
  filter(word1 %in% negation_words) %>%
  inner_join(AFINN, by = c(word2 = "word")) %>%
  count(word1, word2, value, sort = TRUE)

# Top 12 contributions per negation term. reorder_within() appends a
# per-facet suffix for ordering; the labels function strips it again.
negated_words %>%
  mutate(contribution = n * value) %>%
  group_by(word1) %>%
  top_n(12, abs(contribution)) %>%
  ungroup() %>%
  ggplot(aes(reorder_within(word2, contribution, word1), n * value,
             fill = n * value > 0)) +
  geom_col(show.legend = FALSE) +
  xlab("Words preceded by negation term") +
  ylab("Sentiment value * Number of Occurrences") +
  coord_flip() +
  facet_wrap(~word1, scales = "free") +
  scale_x_discrete(labels = function(x) str_replace(x, "__.+$", ""))

library(igraph)
## 
## Attaching package: 'igraph'
## The following objects are masked from 'package:dplyr':
## 
##     as_data_frame, groups, union
## The following objects are masked from 'package:purrr':
## 
##     compose, simplify
## The following object is masked from 'package:tidyr':
## 
##     crossing
## The following object is masked from 'package:tibble':
## 
##     as_data_frame
## The following objects are masked from 'package:stats':
## 
##     decompose, spectrum
## The following object is masked from 'package:base':
## 
##     union
# Re-inspect the filtered bigram counts before building the graph.
bigram_counts
## # A tibble: 33,421 x 3
##    word1   word2         n
##    <chr>   <chr>     <int>
##  1 sir     thomas      287
##  2 miss    crawford    215
##  3 captain wentworth   170
##  4 miss    woodhouse   162
##  5 frank   churchill   132
##  6 lady    russell     118
##  7 lady    bertram     114
##  8 sir     walter      113
##  9 miss    fairfax     109
## 10 colonel brandon     108
## # … with 33,411 more rows
# Keep only frequent bigrams and convert to a directed igraph object
# (word1 -> word2 edges, weighted by n).
bigram_graph <- bigram_counts %>%
  filter(n > 20) %>%
  graph_from_data_frame()

bigram_graph
## IGRAPH bd7ff33 DN-- 91 77 -- 
## + attr: name (v/c), n (e/n)
## + edges from bd7ff33 (vertex names):
##  [1] sir     ->thomas     miss    ->crawford   captain ->wentworth 
##  [4] miss    ->woodhouse  frank   ->churchill  lady    ->russell   
##  [7] lady    ->bertram    sir     ->walter     miss    ->fairfax   
## [10] colonel ->brandon    miss    ->bates      lady    ->catherine 
## [13] sir     ->john       jane    ->fairfax    miss    ->tilney    
## [16] lady    ->middleton  miss    ->bingley    thousand->pounds    
## [19] miss    ->dashwood   miss    ->bennet     john    ->knightley 
## [22] miss    ->morland    captain ->benwick    dear    ->miss      
## + ... omitted several edges
library(ggraph)
set.seed(17)  # fixed seed so the force-directed layout is reproducible

# Fruchterman-Reingold ("fr") layout of the bigram network.
ggraph(bigram_graph, layout = "fr") +
  geom_edge_link() +
  geom_node_point() +
  geom_node_text(aes(label = name), vjust = 1, hjust = 1) +
  theme_void()

# Same network, with repelled labels and without node points.
ggraph(bigram_graph, layout = "fr") +
  geom_edge_link() +
  geom_node_text(aes(label = name), vjust = 1, hjust = 1, repel = TRUE) +
  theme_void()

library(tm)
## Loading required package: NLP
## 
## Attaching package: 'NLP'
## The following object is masked from 'package:ggplot2':
## 
##     annotate
data("AssociatedPress", package = "topicmodels")
AssociatedPress
## <<DocumentTermMatrix (documents: 2246, terms: 10473)>>
## Non-/sparse entries: 302031/23220327
## Sparsity           : 99%
## Maximal term length: 18
## Weighting          : term frequency (tf)
# Tidy the DTM into one row per (document, term, count).
ap_td <- tidy(AssociatedPress)
ap_td
## # A tibble: 302,031 x 3
##    document term       count
##       <int> <chr>      <dbl>
##  1        1 adding         1
##  2        1 adult          2
##  3        1 ago            1
##  4        1 alcohol        1
##  5        1 allegedly      1
##  6        1 allen          1
##  7        1 apparently     2
##  8        1 appeared       1
##  9        1 arrested       1
## 10        1 assault        1
## # … with 302,021 more rows
 ap_td %>%
  inner_join(get_sentiments("bing"), by = c(term = "word"))->
  ap_sentiments 

ap_sentiments
## # A tibble: 30,094 x 4
##    document term    count sentiment
##       <int> <chr>   <dbl> <chr>    
##  1        1 assault     1 negative 
##  2        1 complex     1 negative 
##  3        1 death       1 negative 
##  4        1 died        1 negative 
##  5        1 good        2 positive 
##  6        1 illness     1 negative 
##  7        1 killed      2 negative 
##  8        1 like        2 positive 
##  9        1 liked       1 positive 
## 10        1 miracle     1 positive 
## # … with 30,084 more rows
# Net sentiment contribution of the most frequent sentiment words.
# geom_bar(stat = "identity") is replaced by the idiomatic equivalent
# geom_col(); the ungroup() after count() was a no-op and is removed.
ap_sentiments %>%
  count(sentiment, term, wt = count) %>%
  filter(n >= 200) %>%
  # Plot negative contributions below zero.
  mutate(n = ifelse(sentiment == "negative", -n, n)) %>%
  mutate(term = reorder(term, n)) %>%
  ggplot(aes(term, n, fill = sentiment)) +
  geom_col() +
  ylab("Contribution to sentiment") +
  coord_flip()

data("data_corpus_inaugural", package = "quanteda")
inaug_dfm <- quanteda::dfm(data_corpus_inaugural, verbose = FALSE)
inaug_dfm
## Document-feature matrix of: 58 documents, 9,360 features (91.8% sparse) and 4 docvars.
##                  features
## docs              fellow-citizens  of the senate and house representatives :
##   1789-Washington               1  71 116      1  48     2               2 1
##   1793-Washington               0  11  13      0   2     0               0 1
##   1797-Adams                    3 140 163      1 130     0               2 0
##   1801-Jefferson                2 104 130      0  81     0               0 1
##   1805-Jefferson                0 101 143      0  93     0               0 0
##   1809-Madison                  1  69 104      0  43     0               0 0
##                  features
## docs              among vicissitudes
##   1789-Washington     1            1
##   1793-Washington     0            0
##   1797-Adams          4            0
##   1801-Jefferson      1            0
##   1805-Jefferson      7            0
##   1809-Madison        0            0
## [ reached max_ndoc ... 52 more documents, reached max_nfeat ... 9,350 more features ]
# Tidy the dfm into one row per (document, term, count).
inaug_td <- tidy(inaug_dfm)
inaug_td
## # A tibble: 44,710 x 3
##    document        term            count
##    <chr>           <chr>           <dbl>
##  1 1789-Washington fellow-citizens     1
##  2 1797-Adams      fellow-citizens     3
##  3 1801-Jefferson  fellow-citizens     2
##  4 1809-Madison    fellow-citizens     1
##  5 1813-Madison    fellow-citizens     1
##  6 1817-Monroe     fellow-citizens     5
##  7 1821-Monroe     fellow-citizens     1
##  8 1841-Harrison   fellow-citizens    11
##  9 1845-Polk       fellow-citizens     1
## 10 1849-Taylor     fellow-citizens     1
## # … with 44,700 more rows
# tf-idf of each term within each inaugural address.
inaug_tf_idf <- inaug_td %>%
  bind_tf_idf(term, document, count) %>%
  arrange(desc(tf_idf))

inaug_tf_idf
## # A tibble: 44,710 x 6
##    document        term        count      tf   idf tf_idf
##    <chr>           <chr>       <dbl>   <dbl> <dbl>  <dbl>
##  1 1793-Washington arrive          1 0.00680  4.06 0.0276
##  2 1793-Washington upbraidings     1 0.00680  4.06 0.0276
##  3 1793-Washington violated        1 0.00680  3.37 0.0229
##  4 1793-Washington willingly       1 0.00680  3.37 0.0229
##  5 1793-Washington incurring       1 0.00680  3.37 0.0229
##  6 1793-Washington previous        1 0.00680  2.96 0.0201
##  7 1793-Washington knowingly       1 0.00680  2.96 0.0201
##  8 1793-Washington injunctions     1 0.00680  2.96 0.0201
##  9 1793-Washington witnesses       1 0.00680  2.96 0.0201
## 10 1793-Washington besides         1 0.00680  2.67 0.0182
## # … with 44,700 more rows
# Top tf-idf terms for four notable addresses.
# BUG FIX: arrange(desc(tf-idf)) subtracted the tf and idf columns
# instead of sorting by the tf_idf score; corrected to tf_idf. The
# top_n() weight is also made explicit (was implicit, producing the
# "Selecting by tf_idf" message).
inaug_tf_idf %>%
  filter(document %in% c("1861-Lincoln", "1933-Roosevelt",
                         "1961-Kennedy", "2009-Obama")) %>%
  mutate(term = str_extract(term, "[a-z']+")) %>%
  group_by(document) %>%
  arrange(desc(tf_idf)) %>%
  top_n(10, tf_idf) %>%
  ungroup() %>%
  ggplot(aes(x = reorder_within(term, tf_idf, document),
             y = tf_idf, fill = document)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~document, scales = "free") +
  coord_flip() +
  scale_x_discrete(labels = function(x) str_replace(x, "__.+$", ""))

# Round-trip: cast the tidy table back to a tm DocumentTermMatrix.
ap_td %>%
  cast_dtm(document, term, count)
## <<DocumentTermMatrix (documents: 2246, terms: 10473)>>
## Non-/sparse entries: 302031/23220327
## Sparsity           : 99%
## Maximal term length: 18
## Weighting          : term frequency (tf)
# Cast the same tidy table to a quanteda document-feature matrix instead.
ap_td %>%
  cast_dfm(document, term, count)
## Document-feature matrix of: 2,246 documents, 10,473 features (98.7% sparse).
##     features
## docs adding adult ago alcohol allegedly allen apparently appeared arrested
##    1      1     2   1       1         1     1          2        1        1
##    2      0     0   0       0         0     0          0        1        0
##    3      0     0   1       0         0     0          0        1        0
##    4      0     0   3       0         0     0          0        0        0
##    5      0     0   0       0         0     0          0        0        0
##    6      0     0   2       0         0     0          0        0        0
##     features
## docs assault
##    1       1
##    2       0
##    3       0
##    4       0
##    5       0
##    6       0
## [ reached max_ndoc ... 2,240 more documents, reached max_nfeat ... 10,463 more features ]
data("acq")
acq
## <<VCorpus>>
## Metadata:  corpus specific: 0, document level (indexed): 0
## Content:  documents: 50
# Tidy the corpus: one row per document, metadata columns plus text.
acq_td <- tidy(acq)
acq_td
## # A tibble: 50 x 16
##    author datetimestamp       description heading id    language origin topics
##    <chr>  <dttm>              <chr>       <chr>   <chr> <chr>    <chr>  <chr> 
##  1 <NA>   1987-02-26 10:18:06 ""          COMPUT… 10    en       Reute… YES   
##  2 <NA>   1987-02-26 10:19:15 ""          OHIO M… 12    en       Reute… YES   
##  3 <NA>   1987-02-26 10:49:56 ""          MCLEAN… 44    en       Reute… YES   
##  4 By Ca… 1987-02-26 10:51:17 ""          CHEMLA… 45    en       Reute… YES   
##  5 <NA>   1987-02-26 11:08:33 ""          <COFAB… 68    en       Reute… YES   
##  6 <NA>   1987-02-26 11:32:37 ""          INVEST… 96    en       Reute… YES   
##  7 By Pa… 1987-02-26 11:43:13 ""          AMERIC… 110   en       Reute… YES   
##  8 <NA>   1987-02-26 11:59:25 ""          HONG K… 125   en       Reute… YES   
##  9 <NA>   1987-02-26 12:01:28 ""          LIEBER… 128   en       Reute… YES   
## 10 <NA>   1987-02-26 12:08:27 ""          GULF A… 134   en       Reute… YES   
## # … with 40 more rows, and 8 more variables: lewissplit <chr>, cgisplit <chr>,
## #   oldid <chr>, places <named list>, people <lgl>, orgs <lgl>,
## #   exchanges <lgl>, text <chr>
# Tokenize the article text and drop stop words. The `places`
# list-column is dropped first (presumably because unnest_tokens
# cannot carry it through -- TODO confirm).
acq_tokens <- acq_td %>%
  select(-places) %>%
  unnest_tokens(word, text) %>%
  anti_join(stop_words, by = "word")

# Most common non-stop words across the 50 articles.
count(acq_tokens, word, sort = TRUE)
## # A tibble: 1,566 x 2
##    word         n
##    <chr>    <int>
##  1 dlrs       100
##  2 pct         70
##  3 mln         65
##  4 company     63
##  5 shares      52
##  6 reuter      50
##  7 stock       46
##  8 offer       34
##  9 share       34
## 10 american    28
## # … with 1,556 more rows
# Rank words by tf-idf within each article id: company names dominate.
acq_tokens %>%
  count(id, word) %>%
  bind_tf_idf(word, id, n) %>%
  arrange(desc(tf_idf))
## # A tibble: 2,853 x 6
##    id    word         n     tf   idf tf_idf
##    <chr> <chr>    <int>  <dbl> <dbl>  <dbl>
##  1 186   groupe       2 0.133   3.91  0.522
##  2 128   liebert      3 0.130   3.91  0.510
##  3 474   esselte      5 0.109   3.91  0.425
##  4 371   burdett      6 0.103   3.91  0.405
##  5 442   hazleton     4 0.103   3.91  0.401
##  6 199   circuit      5 0.102   3.91  0.399
##  7 162   suffield     2 0.1     3.91  0.391
##  8 498   west         3 0.1     3.91  0.391
##  9 441   rmj          8 0.121   3.22  0.390
## 10 467   nursery      3 0.0968  3.91  0.379
## # … with 2,843 more rows

# Mapping ----

unloadNamespace("tidytext")
unloadNamespace("ggraph")
unloadNamespace("tidygraph")
unloadNamespace("igraph")
unloadNamespace("tm")
unloadNamespace("gutenbergr")
unloadNamespace("NLP")
unloadNamespace("janeaustenr")
search()
##  [1] ".GlobalEnv"        "package:forcats"   "package:stringr"  
##  [4] "package:dplyr"     "package:purrr"     "package:readr"    
##  [7] "package:tidyr"     "package:tibble"    "package:ggplot2"  
## [10] "package:tidyverse" "package:stats"     "package:graphics" 
## [13] "package:grDevices" "package:utils"     "package:datasets" 
## [16] "package:methods"   "Autoloads"         "package:base"
library(tidyverse)
library(rnaturalearth)
library(rnaturalearthdata)
library(rgeos)
## Loading required package: sp
## rgeos version: 0.5-2, (SVN revision 621)
##  GEOS runtime version: 3.7.2-CAPI-1.11.2 
##  Linking to sp version: 1.3-1 
##  Polygon checking: TRUE
library(ggspatial)
library(sf)
## Linking to GEOS 3.7.2, GDAL 2.4.2, PROJ 5.2.0
library(lwgeom)
## Linking to liblwgeom 3.0.0beta1 r16016, GEOS 3.7.2, PROJ 5.2.0
# Use a plain black-and-white theme for all subsequent maps.
theme_set(theme_bw())
# World country polygons as an sf data frame (class printed below).
world <- ne_countries(scale = "medium", returnclass = "sf")
class(world)
## [1] "sf"         "data.frame"
# Base world map with axis labels and the country count in the subtitle.
ggplot(data = world) +
  geom_sf() +
  xlab("Longitude") +
  ylab("Latitude") +
  ggtitle("World map",
          subtitle = paste0("(", length(unique(world$name)), " countries)"))

# Choropleth of population estimates; sqrt transform tames the skew.
ggplot(data = world) +
  geom_sf(aes(fill = pop_est)) +
  scale_fill_viridis_c(option = "plasma", trans = "sqrt")

# Zoom to the Gulf of Mexico / Caribbean region.
ggplot(data = world) +
  geom_sf() +
  coord_sf(xlim = c(-102.15, -74.12), ylim = c(7.65, 33.97), expand = FALSE)

# GDP estimates over East and South Asia.
ggplot(data = world) +
  geom_sf(aes(fill = gdp_md_est)) +
  coord_sf(xlim = c(70, 150), ylim = c(15, 55), expand = FALSE) +
  scale_fill_viridis_c(option = "plasma", trans = "sqrt")

# Country label coordinates: centroid X/Y columns appended to `world`.
# The earlier `world_points <- st_centroid(world)` was a dead store --
# its result was immediately overwritten -- so it has been removed.
# NOTE(review): st_centroid warns that centroids on lon/lat data are
# approximate; acceptable here since they only position text labels.
world_points <- cbind(world, st_coordinates(st_centroid(world$geometry)))
# Country names at their centroids plus a sea annotation.
ggplot(data = world) +
  geom_sf() +
  geom_text(data = world_points, aes(x = X, y = Y, label = name),
            color = "darkblue", fontface = "bold", check_overlap = FALSE) +
  annotate(geom = "text", x = -90, y = 26, label = "Gulf of Mexico",
           fontface = "italic", color = "grey22", size = 6) +
  coord_sf(xlim = c(-97.15, -70.12), ylim = c(7.65, 30.97), expand = FALSE)

# Fully styled map: land fill, labels, scale bar, north arrow, and a
# blue panel background for water.
ggplot(data = world) +
  geom_sf(fill = "antiquewhite") +
  geom_text(data = world_points, aes(x = X, y = Y, label = name),
            color = "darkblue", fontface = "bold", check_overlap = FALSE) +
  annotate(geom = "text", x = -90, y = 26, label = "Gulf of Mexico",
           fontface = "italic", color = "grey22", size = 6) +
  annotation_scale(location = "bl", width_hint = 0.5) +
  annotation_north_arrow(location = "bl", which_north = "true",
                         pad_x = unit(0.75, "in"), pad_y = unit(0.5, "in"),
                         style = north_arrow_fancy_orienteering) +
  coord_sf(xlim = c(-102.15, -74.12), ylim = c(7.65, 33.97), expand = FALSE) +
  xlab("Longitude") +
  ylab("Latitude") +
  ggtitle("Map of the Gulf of Mexico and the Caribbean Sea") +
  theme(panel.grid.major = element_line(color = gray(.5),
                                        linetype = "dashed", size = 0.5),
        panel.background = element_rect(fill = "aliceblue"))
## Scale on map varies by more than 10%, scale bar may be inaccurate

# Two point sites (lon/lat), converted to sf with WGS84 (EPSG:4326).
sites <- data.frame(
  longitude = c(-80.144005, -80.109),
  latitude = c(26.479005, 26.83)
)
sites <- st_as_sf(sites, coords = c("longitude", "latitude"),
                  crs = 4326, agr = "constant")

ggplot(data = world) +
  geom_sf() +
  geom_sf(data = sites, size = 4, shape = 23, fill = "darkred") +
  coord_sf(xlim = c(-88, -78), ylim = c(24.5, 33), expand = FALSE)

# Tighter zoom on the two sites.
ggplot(data = world) +
  geom_sf() +
  geom_sf(data = sites, size = 4, shape = 23, fill = "darkred") +
  coord_sf(xlim = c(-82, -78), ylim = c(24.5, 28), expand = FALSE)

library(maps)
## 
## Attaching package: 'maps'
## The following object is masked from 'package:purrr':
## 
##     map
# US state polygons from the maps package, converted to sf; add centroid
# coordinates (X, Y) for label placement and title-case the state names.
states <- st_as_sf(map("state", plot = FALSE, fill = TRUE))
head(states)
## (printed output omitted: 6 MULTIPOLYGON features, ID + geom, EPSG:4326)
states <- cbind(states, st_coordinates(st_centroid(states)))
## Warnings (see knitted output): st_centroid assumes constant attributes and
## is approximate on longitude/latitude data.
states$ID <- stringr::str_to_title(states$ID)

# Southeastern view with state outlines and centered name labels.
ggplot(world) +
  geom_sf() +
  geom_sf(data = states, fill = NA) +
  geom_text(data = states, aes(X, Y, label = ID), size = 5) +
  coord_sf(xlim = c(-88, -78), ylim = c(24.5, 33), expand = FALSE)

# Florida counties only, with an `area` column (numeric, from st_area),
# shaded on a square-root color scale.
counties <- st_as_sf(map("county", plot = FALSE, fill = TRUE))
counties <- subset(counties, grepl("florida", counties$ID))
counties$area <- as.numeric(st_area(counties))
head(counties)
## (printed output omitted: IDs like "florida,alachua" plus area; see knitted
## document)
ggplot(world) +
  geom_sf() +
  geom_sf(data = counties, aes(fill = area)) +
  scale_fill_viridis_c(trans = "sqrt", alpha = .4) +
  coord_sf(xlim = c(-88, -78), ylim = c(24.5, 33), expand = FALSE)

# Five Florida cities as an sf point layer; remove = FALSE keeps the raw
# lng/lat columns so geom_text can position labels later.
flcities <- data.frame(
  state = rep("Florida", 5),
  city = c("Miami", "Tampa", "Orlando", "Jacksonville", "Sarasota"),
  lat = c(25.7616798, 27.950575, 28.5383355, 30.3321838, 27.3364347),
  lng = c(-80.1917902, -82.4571776, -81.3792365, -81.655651, -82.5306527)
)

(flcities <- st_as_sf(flcities, coords = c("lng", "lat"), remove = FALSE,
                      crs = 4326, agr = "constant"))
## (printed sf output omitted: 5 POINT features, EPSG:4326)

# County outlines plus city points with plain text labels.
ggplot(world) +
  geom_sf() +
  geom_sf(data = counties, fill = NA, color = gray(.5)) +
  geom_sf(data = flcities) +
  geom_text(
    data = flcities, aes(x = lng, y = lat, label = city),
    size = 3.9, col = "black", fontface = "bold"
  ) +
  coord_sf(xlim = c(-88, -78), ylim = c(24.5, 33), expand = FALSE)

library("ggrepel")
# Same city map, but with ggrepel labels nudged per city to avoid overlap.
# nudge_x/nudge_y are positional: one offset per city, in flcities row order.
ggplot(world) +
  geom_sf() +
  geom_sf(data = counties, fill = NA, color = gray(.5)) +
  geom_sf(data = flcities) +
  geom_text_repel(
    data = flcities, aes(x = lng, y = lat, label = city),
    fontface = "bold",
    nudge_x = c(1, -1.5, 2, 2, -1),
    nudge_y = c(0.25, -0.25, 0.5, 0.5, -0.5)
  ) +
  coord_sf(xlim = c(-88, -78), ylim = c(24.5, 33), expand = FALSE)

# Final composite map: state labels are nudged down by default, with custom
# nudges for Florida and South Carolina so labels stay inside the frame.
states$nudge_y <- -1
states$nudge_y[states$ID == "Florida"] <- 1.1
states$nudge_y[states$ID == "South Carolina"] <- .1

# Layer order matters: background world, county fill, state outlines, sites,
# cities, labels, then scale bar / north arrow annotations.
ggplot(world) +
  geom_sf(fill = "antiquewhite1") +
  geom_sf(data = counties, aes(fill = area)) +
  geom_sf(data = states, fill = NA) +
  geom_sf(data = sites, size = 4, shape = 23, fill = "darkred") +
  geom_sf(data = flcities) +
  geom_text_repel(
    data = flcities, aes(x = lng, y = lat, label = city),
    fontface = "bold",
    nudge_x = c(1, -1.5, 2, 2, -1),
    nudge_y = c(0.25, -0.25, 0.5, 0.5, -0.5)
  ) +
  geom_label(
    data = states, aes(X, Y, label = ID), size = 5, fontface = "bold",
    nudge_y = states$nudge_y
  ) +
  scale_fill_viridis_c(trans = "sqrt", alpha = .4) +
  annotation_scale(location = "bl", width_hint = 0.4) +
  annotation_north_arrow(
    location = "bl", which_north = "true",
    pad_x = unit(0.75, "in"), pad_y = unit(0.5, "in"),
    style = north_arrow_fancy_orienteering
  ) +
  coord_sf(xlim = c(-88, -78), ylim = c(24.5, 33), expand = FALSE) +
  xlab("Longitude") + ylab("Latitude") +
  ggtitle("Observation Sites",
          subtitle = "(2 sites in Palm Beach County, Florida)") +
  theme(
    panel.grid.major = element_line(
      color = gray(0.5), linetype = "dashed", size = 0.5
    ),
    panel.background = element_rect(fill = "aliceblue")
  )

usmap

# Detach maps so its `map()` no longer shadows purrr::map below.
unloadNamespace("maps")
library(usmap)
data(statepop)
# State geography lookup (OID kept as character to preserve leading zeros).
states <- read_csv("./data/state_geo_data.csv", col_types = cols(OID = col_character()))
# JHU CSSE COVID-19 time-series base URL; confirmed and deaths US files.
url_in <- "https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/"
df <- tibble(file_names = c("time_series_covid19_confirmed_US.csv",
                    "time_series_covid19_deaths_US.csv"))
# Download both CSVs into a nested list-column; FIPS read as factor, "" as NA.
# case_type is extracted from the file name ("confirmed_US" / "deaths_US").
df %>% 
  mutate(url = str_c(url_in, file_names, sep =""),
         data = map(url, ~read_csv(., col_types = cols(FIPS = col_factor()), na = "")),
         case_type = as.factor(str_extract(file_names,"[:alpha:]*_[gU][:alpha:]*"))) %>%
  select(case_type, data) ->
  df

# Record each nested frame's column names (used to inspect the raw schema).
 df$vars <- map(df$data,  names)
# Helper: rename data-frame columns by regex substitution.
# Every column name matching `pattern` has the match replaced by `repl`
# (via stringr::str_replace_all); returns the modified data frame.
fix_names <- function(df, pattern, repl) {
  stopifnot(is.data.frame(df), is.character(pattern), is.character(repl))
  names(df) <- str_replace_all(names(df), pattern, repl)
  df
}
# Clean each nested data frame:
#  - rename Admin2 -> County and Long_ -> Long
#  - only the deaths file carries a Population column, so add Population = 0
#    to the other file; note the map_if condition reads case_type from the
#    OUTER df (df$case_type), not from the nested data frames
#  - drop rows without a FIPS code
#  - derive county FIPS (cFIPS = last 5 chars of UID) and state FIPS
#    (sFIPS = first 2 chars of cFIPS)
df %>% 
  mutate(data = map(data, ~ fix_names(., "Admin2", "County")),
         data = map(data, ~ fix_names(., "Long_", "Long")),
         data = map_if(data, !str_detect(df$case_type,"deaths_US"),
                       ~ mutate(., Population = 0)),
         data = map(data, ~filter(., !is.na(FIPS))),
         data = map(data, ~ mutate(., cFIPS = str_extract(UID,".{5}$"),
                                   sFIPS = str_extract(cFIPS, "^..")))
         )->df

# Refresh the stored column names after the renames above.
df$vars <- map(df$data,  names)
# Pivot the wide per-date columns (all contain "/") into long form: one row
# per location per Date, value in Daily_Total (presumably cumulative counts
# in the JHU series — confirm against the source data).
df %>% 
  mutate(data = map(data, ~ pivot_longer(data = ., cols = contains("/"), 
                                         names_to = "Date", 
                                         values_to = "Daily_Total") ))-> 
df

# Helper: parse a month/day/year character column into Date in place.
# `var_name` names the column to convert with lubridate::mdy(); returns the
# data frame with that column replaced by Dates.
fix_date_mdy <- function(df, var_name) {
  stopifnot(is.data.frame(df), is.character(var_name))
  df[[var_name]] <- mdy(df[[var_name]])
  df
}
library(lubridate)
## 
## Attaching package: 'lubridate'
## The following object is masked from 'package:base':
## 
##     date
# Convert the "m/d/yy" header strings now stored in Date into real Dates.
df %>%
  mutate(data = map(data, ~fix_date_mdy(., "Date")))-> 
df
# Flatten the two nested frames into one long data frame (both case types).
 df %>% 
  unnest(cols=data) %>% 
  ungroup() ->
  df_all
   # rm(df)
# The vars list-column is no longer needed after unnesting.
df_all <- select(df_all, -vars)
# Per location (UID): drop unusable rows, then backfill Population across
# case types — confirmed rows carry Population 0 (set earlier), so
# max(Population) within the UID group picks up the deaths-file value.
df_all %>% 
  group_by(UID) %>% 
  filter(!is.na(cFIPS), County !="Unassigned", !str_detect(County, "^Out of ")) %>% 
  mutate(Population = max(Population),
         Daily_Total_percap = Daily_Total/Population)->
  df_all
#write_csv(df_all, "./data/covid_us_data.csv")
# Blank state-level map — a sanity check that usmap plotting works.
plot_usmap(regions = "states") +
  labs(
    title = "US States",
    subtitle = "This is a blank map of the states of the United States."
  ) +
  theme(panel.background = element_rect(color = "black", fill = "lightblue"))

# Blank map restricted to the South Atlantic census division, with labels.
plot_usmap(include = .south_atlantic, labels = TRUE) +
  labs(
    title = "US South Atlantic States",
    subtitle = "This is a blank map of the states in the South Atlantic Census Region"
  ) +
  theme(panel.background = element_rect(color = "black", fill = "lightblue"))

# Blank nationwide map at county resolution.
plot_usmap(regions = "counties") +
  labs(
    title = "US Counties",
    subtitle = "This is a blank map of the counties of the United States."
  ) +
  theme(panel.background = element_rect(color = "black", fill = "lightblue"))

# County map limited to the three DMV jurisdictions.
# FIX: corrected the garbled subtitle text ("MD, DC< anD VA." ->
# "MD, DC, and VA.").
plot_usmap("counties", include = c("VA", "MD", "DC")) +
  labs(title = "Custom DMV Area",
       subtitle = "These are the counties in MD, DC, and VA.")+ 
  theme(panel.background = element_rect(color = "blue", fill = "lightblue"))

# 5-digit FIPS codes for the counties/independent cities of the greater DMV.
dmv_county_fips <- c(
  "11001", "24003", "24009", "24017", "24021", "24027", "24031",
  "24033", "51013", "51059", "51061", "51107", "51153", "51179",
  "51510", "51600", "51610", "51630", "51683", "51685"
)
plot_usmap("counties", include = dmv_county_fips) +
  labs(title = "Greater DMV Area",
       subtitle = "These are the Counties/Cities in the Greater DMV area.") +
  theme(panel.background = element_rect(color = "blue", fill = "lightblue"))

# Latest-day totals per state: for each state FIPS (sFIPS) and case type,
# sum the county Daily_Total; per-capita divides by summed county Population.
# df_all is still grouped by UID here — group_by() below replaces that.
df_all %>% 
  filter(Date == max(Date), Population >0, !is.na(sFIPS)) %>% 
  group_by(sFIPS, case_type, Date) %>% 
  summarise(Current_Total = sum(Daily_Total),
            Current_Total_percap = sum(Daily_Total)/sum(Population))->
  state_totals
# Confirmed-case choropleth; plot_usmap matches rows on a `fips` column.
state_totals %>%
  filter(case_type == "confirmed_US") %>%
  rename(fips = sFIPS) ->
  state_cases
plot_usmap(data = state_cases, values = "Current_Total", color = "blue") +
  scale_fill_continuous(
    low = "white", high = "red",
    name = "Confirmed Cases", label = scales::comma
  ) +
  labs(
    title = "US States",
    subtitle = paste0("Total Cases by State as of ", max(state_cases$Date))
  ) +
  theme(panel.background = element_rect(color = "black", fill = "white")) +
  theme(legend.position = "top")

# Same choropleth for total deaths by state (state_cases is reassigned).
state_totals %>%
  filter(case_type == "deaths_US") %>%
  rename(fips = sFIPS) ->
  state_cases
plot_usmap(data = state_cases, values = "Current_Total", color = "blue") +
  scale_fill_continuous(
    low = "white", high = "red",
    name = "Total Deaths", label = scales::comma
  ) +
  labs(
    title = "US States",
    subtitle = paste0("Total Deaths by State as of ", max(state_cases$Date))
  ) +
  theme(panel.background = element_rect(color = "black", fill = "white")) +
  theme(legend.position = "top")

# NOTE(review): this chunk is byte-for-byte identical to the deaths-by-state
# map immediately above — likely an accidental duplicate; confirm whether one
# copy can be removed from the source document.
state_totals %>% 
  filter(case_type =="deaths_US") %>% 
  rename(fips = sFIPS)-> 
  state_cases
plot_usmap(data = state_cases, values = "Current_Total", color = "blue") + 
  scale_fill_continuous(low = "white", high = "red", 
                        name = "Total Deaths", label = scales::comma)+
  labs(title = "US States",
       subtitle = paste0("Total Deaths by State as of ", max(state_cases$Date))) + 
  theme(panel.background = element_rect(color = "black", fill = "white")) +
  theme(legend.position = "top")

# Deaths per capita by state.
# FIX: removed the stray `... =` argument name inside paste0() — an editor
# autocomplete artifact; paste0() has no argument matchable as `...`, so the
# name served no purpose and obscured the call.
state_totals %>% 
  filter(case_type =="deaths_US") %>% 
  rename(fips = sFIPS)-> 
  state_cases
plot_usmap(data = state_cases, values = "Current_Total_percap", 
           color = "blue") + 
  scale_fill_continuous(low = "white", high = "red", 
                        name = "Total Deaths Per Capita", 
                        label = scales::comma)+
  labs(title = "US States",
       subtitle = paste0("Total Deaths per Capita by State as of ", 
                         max(state_cases$Date))) + 
  theme(panel.background = element_rect(color = "black", fill = "white")) +
  theme(legend.position = "right")

# Log10 of per-capita deaths, to spread out the compressed per-capita scale.
# FIX: removed the stray `... =` argument name inside paste0() (editor
# artifact; see previous chunk).
state_totals %>% 
  filter(case_type =="deaths_US") %>% 
  mutate(Current_Total10 = log10(Current_Total),
         Current_Total_percap10 = log10(Current_Total_percap)) %>% 
  rename(fips = sFIPS)-> 
  state_cases
plot_usmap(data = state_cases, values = "Current_Total_percap10", 
           color = "blue") + 
  scale_fill_continuous(low = "white", high = "red", 
                        name = "Total Deaths per Capita (Log 10)", 
                        label = scales::comma)+
  labs(title = "US States",
       subtitle = paste0("Log(10) of Total Deaths per Capita by State as of ", 
                         max(state_cases$Date))) + 
  theme(panel.background = element_rect(color = "black", fill = "white")) +
  theme(legend.position = "right")

# Deaths per unit land area: join state geography by FIPS to get AREALAND.
# FIXES: removed the stray `... =` inside paste0(), and corrected the legend
# name — Current_Total_perarea is a plain ratio with no log10 applied, so the
# "(Log 10)" suffix was misleading.
state_cases %>% 
  left_join(states, by = c("fips"="STATE")) %>% 
  mutate(Current_Total_perarea = Current_Total/AREALAND)->
  state_casesa

plot_usmap(data = state_casesa, values = "Current_Total_perarea", 
           color = "blue") + 
  scale_fill_continuous(low = "white", high = "red", 
                        name = "Total Deaths per unit Area", 
                        label = scales::comma)+
  labs(title = "US States",
       subtitle = paste0("Total Deaths per unit area by State as of ", 
                         max(state_cases$Date))) + 
  theme(panel.background = element_rect(color = "black", fill = "white")) +
  theme(legend.position = "right")

# Latest-day totals per county, analogous to state_totals but keyed by the
# county FIPS (cFIPS).
df_all %>% 
  filter(Date == max(Date), Population >0, !is.na(cFIPS)) %>% 
  group_by(cFIPS, case_type, Date) %>% 
  summarise(Current_Total = sum(Daily_Total),
            Current_Total_percap = sum(Daily_Total)/sum(Population))->
  county_totals
# Confirmed cases per county. Current_Total_log2 is computed but the maps
# below plot the raw Current_Total / Current_Total_percap columns.
county_totals %>% 
  filter(case_type =="confirmed_US") %>% 
  mutate(Current_Total_log2 = log2(Current_Total)) %>% 
  rename(fips = cFIPS)-> 
  county_cases
# County-level confirmed cases for NY and NJ.
# FIX: the subtitle date now comes from the plotted data (county_cases)
# instead of the unrelated state_cases object; both are filtered to the same
# max(Date), so the rendered text is unchanged, but the label no longer
# depends on a different data frame.
# NOTE(review): the title still says "US States" although the map shows
# counties — confirm the intended title.
plot_usmap(data = county_cases, include = c("NY","NJ"),
           values = "Current_Total", color = "blue") + 
  scale_fill_continuous(low = "white", high = "red", 
                        name = "Confirmed Cases", label = scales::comma)+
  labs(title = "US States",
       subtitle = paste0("Total Cases by County as of ", 
                         max(county_cases$Date))) + 
  theme(panel.background = element_rect(color = "black", fill = "white")) +
  theme(legend.position = "top")

# DMV counties/cities, confirmed cases.
# FIX: subtitle date now references the plotted county_cases data rather than
# the unrelated state_cases object (both hold the same max Date, so the
# rendered text is unchanged).
plot_usmap(data = county_cases, include = dmv_county_fips,
           values = "Current_Total", color = "blue") + 
  scale_fill_continuous(low = "white", high = "red", 
                        name = "Confirmed Cases", label = scales::comma)+
  labs(title = "DMV Region",
       subtitle = paste0("Total Cases by County/City as of ", 
                         max(county_cases$Date))) + 
  theme(panel.background = element_rect(color = "black", fill = "white")) +
  theme(legend.position = "top")

# DMV counties/cities, total deaths.
# FIX: subtitle date now references the plotted county_deaths data rather
# than the unrelated state_cases object (same max Date, unchanged output).
county_totals %>% 
  filter(case_type =="deaths_US") %>% 
  rename(fips = cFIPS)-> 
  county_deaths
plot_usmap(data = county_deaths, include = dmv_county_fips,
           values = "Current_Total", color = "blue") + 
  scale_fill_continuous(low = "white", high = "red", 
                        name = "Confirmed Deaths", label = scales::comma)+
  labs(title = "DMV Region",
       subtitle = paste0("Total Deaths by County/City as of ", 
                         max(county_deaths$Date))) + 
  theme(panel.background = element_rect(color = "black", fill = "white")) +
  theme(legend.position = "top")

# DMV counties/cities, confirmed cases per capita.
# FIX: subtitle date now references the plotted county_cases data rather than
# the unrelated state_cases object (same max Date, unchanged output).
plot_usmap(data = county_cases, include = dmv_county_fips,
           values = "Current_Total_percap", color = "blue") + 
  scale_fill_continuous(low = "white", high = "red", 
                        name = "Confirmed Cases Per Capita", label = scales::comma)+
  labs(title = "DMV Region",
       subtitle = paste0("Total Cases per Capita by County/City as of ", 
                         max(county_cases$Date))) + 
  theme(panel.background = element_rect(color = "black", fill = "white")) +
  theme(legend.position = "top")

# DMV counties/cities, deaths per capita.
# FIX: subtitle date now references the plotted county_deaths data rather
# than the unrelated state_cases object (same max Date, unchanged output).
# NOTE(review): this re-creates county_deaths identically to the chunk above;
# the filter/rename could be dropped if both chunks always run in order.
county_totals %>% 
  filter(case_type =="deaths_US") %>% 
  rename(fips = cFIPS)-> 
  county_deaths
plot_usmap(data = county_deaths, include = dmv_county_fips,
           values = "Current_Total_percap", color = "blue") + 
  scale_fill_continuous(low = "white", high = "red", 
                        name = "Confirmed Deaths per Capita", label = scales::comma)+
  labs(title = "DMV Region",
       subtitle = paste0("Total Deaths per Capita by County/City as of ", 
                         max(county_deaths$Date))) + 
  theme(panel.background = element_rect(color = "black", fill = "white")) +
  theme(legend.position = "top")

# Census Data ----

library(tidycensus)
library(tigris)
## To enable 
## caching of data, set `options(tigris_use_cache = TRUE)` in your R script or .Rprofile.
## 
## Attaching package: 'tigris'
## The following object is masked from 'package:graphics':
## 
##     plot
library(sf)
library(viridis) #a color palette
## Loading required package: viridisLite
options(tigris_use_cache = TRUE)
# Median household income (ACS variable B19013_001) by tract for Baltimore
# City, MD, with tract geometry attached; API key read from the environment.
baltimore <- get_acs(
  state = "MD", county = "Baltimore City", geography = "tract",
  variables = "B19013_001", geometry = TRUE,
  key = Sys.getenv("CENSUS_API_KEY")
)
## Getting data from the 2014-2018 5-year ACS
#head(baltimore)
ggplot(baltimore, aes(fill = estimate)) +
  geom_sf(color = NA) +
  coord_sf(crs = 26911) +
  scale_fill_viridis_c(option = "magma")

# 2010 decennial race/ethnicity counts by tract, expressed as a percent of
# the tract total (summary_var P001001), faceted by group.
racevars <- c(
  White = "P005003",
  Black = "P005004",
  Asian = "P005006",
  Hispanic = "P004003"
)
baltimore_race <- get_decennial(
  state = "MD", county = "Baltimore City",
  geography = "tract",
  variables = racevars, geometry = TRUE,
  summary_var = "P001001",
  key = Sys.getenv("CENSUS_API_KEY")
)
## Getting data from the 2010 decennial Census
baltimore_race %>%
  mutate(pct = 100 * (value / summary_value)) %>%
  ggplot(aes(fill = pct)) +
  facet_wrap(~variable) +
  geom_sf(color = NA) +
  coord_sf(crs = 26915) +
  scale_fill_viridis_c()

# DC tract-level median household income; derive the 6-digit tract code from
# GEOID (chars 6-11) so it can later be joined to the crime data's
# CENSUS_TRACT column.
dc_median <- get_acs(
  state = "DC", county = "District of Columbia", geography = "tract",
  variables = "B19013_001", geometry = TRUE,
  key = Sys.getenv("CENSUS_API_KEY")
)
## Getting data from the 2014-2018 5-year ACS
dc_median %>%
  mutate(CENSUS_TRACT = str_sub(GEOID, 6, 11)) ->
  dc_median
ggplot(dc_median, aes(fill = estimate)) +
  geom_sf(aes(geometry = geometry), color = NA) +
  coord_sf(crs = 26915) +
  scale_fill_viridis_c()

# DC crime incidents; CENSUS_TRACT kept as character to match dc_median.
crime <- read_csv(
  "./data/dc-crimes-search-results.csv",
  col_types = cols(
    DISTRICT = col_factor(),
    WARD = col_factor(),
    PSA = col_factor(),
    METHOD = col_factor(),
    CENSUS_TRACT = col_character()
  )
)
# Count incidents per tract and method, attach tract geometry via dc_median,
# and shade tracts by count, faceted by method.
crime %>%
  filter(METHOD != "others") %>%
  group_by(CENSUS_TRACT, METHOD) %>%
  count() %>%
  left_join(dc_median) %>%
  ggplot(aes(fill = n)) +
  geom_sf(aes(geometry = geometry), color = NA) +
  coord_sf(crs = 26915) +
  scale_fill_viridis_c() +
  facet_wrap(~METHOD)
## Joining, by = "CENSUS_TRACT"

# Ward boundaries (2012 shapefile); WARD becomes a factor to mirror crime.
wards <- read_sf("./data/Ward_from_2012")
wards %>%
  mutate(WARD = parse_factor(as.character(WARD))) ->
  wards
# head(wards)
# Incident counts per ward and method, shaded on the ward polygons.
crime %>%
  filter(METHOD != "others") %>%
  group_by(WARD, METHOD) %>%
  count() %>%
  left_join(wards) %>%
  ggplot(aes(fill = n)) +
  geom_sf(aes(geometry = geometry), color = NA) +
  coord_sf(crs = 26915) +
  scale_fill_viridis_c() +
  facet_wrap(~METHOD)
## Joining, by = "WARD"
## Warning: Column `WARD` joining factors with different levels, coercing to
## character vector

# Check whether CENSUS_TRACT maps 1:1 to WARD: keep the distinct
# (tract, ward) pairs, count pairs per tract, and list tracts appearing in
# more than one ward. The output shows 20 tracts (including NA) straddle
# ward boundaries, so tract -> ward is NOT a unique mapping.
crime %>% 
  select(CENSUS_TRACT, WARD) %>% 
  group_by(CENSUS_TRACT) %>% 
  unique() %>% 
  count() %>% 
  filter(n>1) %>% 
  arrange(desc(n))
## # A tibble: 20 x 2
## # Groups:   CENSUS_TRACT [20]
##    CENSUS_TRACT     n
##    <chr>        <int>
##  1 <NA>             4
##  2 004801           3
##  3 000300           2
##  4 001401           2
##  5 001402           2
##  6 002302           2
##  7 003400           2
##  8 004201           2
##  9 004300           2
## 10 004600           2
## 11 004901           2
## 12 004902           2
## 13 005800           2
## 14 005900           2
## 15 006202           2
## 16 006804           2
## 17 007601           2
## 18 007604           2
## 19 007605           2
## 20 010600           2
# American University's location as an sf point (WGS84), overlaid as a red
# diamond on the ward-level crime map.
au_latlong <- data.frame(longitude = -77.0888, latitude = 38.9375)
au_latlong <- st_as_sf(au_latlong, coords = c("longitude", "latitude"),
                       crs = 4326, agr = "constant")
crime %>%
  filter(METHOD != "others") %>%
  group_by(WARD, METHOD) %>%
  count() %>%
  left_join(wards) %>%
  ggplot(aes(fill = n)) +
  geom_sf(aes(geometry = geometry), color = NA) +
  coord_sf(crs = 26915) +
  scale_fill_viridis_c() +
  facet_wrap(~METHOD) +
  geom_sf(data = au_latlong, size = 4, shape = 23, fill = "red")
## Joining, by = "WARD"
## Warning: Column `WARD` joining factors with different levels, coercing to
## character vector
## Coordinate system already present. Adding new coordinate system, which will replace the existing one.

# Median-income tract map with the AU marker layered on top.
ggplot(dc_median, aes(fill = estimate)) +
  geom_sf(aes(geometry = geometry), color = NA) +
  coord_sf(crs = 26915) +
  scale_fill_viridis_c() +
  geom_sf(data = au_latlong, size = 4, shape = 23, fill = "red")
## Coordinate system already present. Adding new coordinate system, which will replace the existing one.